Simple example which will set up a Watson Discovery collection, upload the pre-processed InsuranceLibV2 corpus, split the labelled queries into train and validation sets, and evaluate Discovery's ranking performance on the validation set.
To learn more about the data used in the experiment, see here: https://github.ibm.com/rchakravarti/rnr-debugging-scripts/tree/master/resources/insurance_lib_v2
Note: Ensure credentials have been updated in config/config.ini
In [1]:
import sys
from os import path, getcwd
import json
from tempfile import mkdtemp
import glob
sys.path.extend([path.abspath(path.join(getcwd(), path.pardir))])
from rnr_debug_helpers.utils.discovery_wrappers import DiscoveryProxy
from rnr_debug_helpers.utils.io_helpers import load_config, smart_file_open, \
    RankerRelevanceFileQueryStream, initialize_query_stream, insert_modifier_in_filename, PredictionReader
from rnr_debug_helpers.create_cross_validation_splits import split_file_into_train_and_test
from rnr_debug_helpers.compute_ranking_stats import compute_performance_stats
config_file_path = path.abspath(path.join(getcwd(), path.pardir, 'config', 'config.ini'))
print('Using config from {}'.format(config_file_path))
config = load_config(config_file_path=config_file_path)
insurance_lib_data_dir = path.abspath(path.join(getcwd(), path.pardir, 'resources', 'insurance_lib_v2'))
print('Using data from {}'.format(insurance_lib_data_dir))
In [2]:
# Either re-use an existing collection id by overriding the value below, or leave as-is to create one
collection_id = '96b27926-7d48-47dc-a0fd-2f7333e3d6e4'
discovery = DiscoveryProxy(config)
collection_id = discovery.setup_collection(collection_id=collection_id,
                                           config_file=path.join(insurance_lib_data_dir,
                                                                 'discovery_config.json'))
We use the same InsuranceLibV2 corpus file that had been pre-processed and formatted into the Solr format for adding documents. Discovery essentially expects each document as JSON of the form {field name 1: field value 1, ...}, so we iterate over the docs in the Solr-format XML file and upload them one at a time.
Since Discovery doesn't (yet) have a bulk upload API, we speed up the uploads with multiprocessing. This seems to misbehave in a Python notebook, so run the actual upload in a regular Python script.
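To make the expected format concrete, here is a minimal sketch (using a made-up document) of how a single Solr-format <doc> entry maps to the JSON that gets sent to Discovery; the field names match the corpus file, but the content is invented for illustration:

import json
from xml.etree import ElementTree as ET

# A made-up Solr-format entry, just to show the field name/value layout
sample_doc = '''
<doc>
  <field name="id">example-123</field>
  <field name="body">An example insurance answer passage.</field>
</doc>
'''

element = ET.fromstring(sample_doc)
fields = {field.attrib['name']: field.text for field in element.findall('field')}
doc_id = fields['id']
discovery_document = {'body': fields['body']}  # the JSON payload Discovery indexes
print(doc_id, json.dumps(discovery_document))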
In [3]:
from collections import defaultdict
from xml.etree import ElementTree as ET


def _parse_doc_elements(element):
    """
    Parses a single document element from the Solr format xml element into a format Discovery can understand
    :param element:
    :return:
    """
    doc_id, body = None, None
    for field in element.findall("field"):
        if field.attrib['name'] == 'id':
            doc_id = field.text
        elif field.attrib['name'] == 'body':
            body = field.text
    if doc_id is None or body is None:
        raise ValueError('Unable to parse id and body from xml entry: %s' % element)
    return doc_id, {'body': body}


def document_corpus_as_iterable(corpus):
    stats = defaultdict(int)
    with smart_file_open(corpus) as infile:
        # emit one (doc_id, document) pair per <doc> element in the Solr-format XML
        for event, element in ET.iterparse(infile):
            if event == 'end' and element.tag == 'doc':
                stats['num_xml_entries'] += 1
                yield _parse_doc_elements(element)


# This thing seems to misbehave when run from python notebooks due to its use of multiprocessing, so just run in a script
# discovery.upload_documents(collection_id=collection_id,
#                            corpus=document_corpus_as_iterable(
#                                path.join(insurance_lib_data_dir, 'document_corpus.solr.xml')))
discovery.print_collection_stats(collection_id)
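If the multiprocessing wrapper misbehaves and you only want to sanity-check a handful of documents from inside the notebook, a sequential upload against the Discovery REST API looks roughly like the sketch below. This is an illustration rather than the wrapper's actual implementation: it assumes pre-IAM username/password service credentials and a known environment_id, neither of which is read from config.ini here.

import itertools
import json
import requests

DISCOVERY_URL = 'https://gateway.watsonplatform.net/discovery/api'  # assumed default endpoint
API_VERSION = '2017-11-07'  # whichever Discovery API version date you target

def upload_document(username, password, environment_id, collection_id, doc_id, doc_json):
    # POST .../documents/{document_id} adds (or replaces) a document under a caller-chosen id
    url = '{}/v1/environments/{}/collections/{}/documents/{}'.format(
        DISCOVERY_URL, environment_id, collection_id, doc_id)
    files = {'file': ('{}.json'.format(doc_id), json.dumps(doc_json), 'application/json')}
    response = requests.post(url, params={'version': API_VERSION}, files=files,
                             auth=(username, password))
    response.raise_for_status()
    return response.json()

# e.g. push just the first few docs from the corpus iterator defined above
# corpus = document_corpus_as_iterable(path.join(insurance_lib_data_dir, 'document_corpus.solr.xml'))
# for doc_id, doc_json in itertools.islice(corpus, 5):
#     upload_document('<username>', '<password>', '<environment_id>', collection_id, doc_id, doc_json)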
First we generate a train and test split so that we can compare with the performance of Discovery after we add training data.
Note: Since we do not have control over training the same way we did with RetrieveAndRank, it is difficult to use the cross-validation splits for evaluation as we did in 2.0 - Evaluate RnR Performance. Instead, as soon as training data is uploaded to the service, it gets consumed by Discovery (with some latency).
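For intuition, the helper call in the next cell performs (roughly) an 80/20 split of the labelled queries. A minimal sketch of that idea, assuming one row per labelled query with the query in the first column (the repo's split_file_into_train_and_test handles the real file format and bookkeeping):

import csv
import random

def split_relevance_file(input_csv, train_csv, test_csv, train_percentage=0.80, seed=42):
    with open(input_csv) as infile:
        rows = list(csv.reader(infile))

    # shuffle queries, then send the first 80% to train and the rest to validation
    random.Random(seed).shuffle(rows)
    num_train = int(len(rows) * train_percentage)

    for filename, subset in [(train_csv, rows[:num_train]), (test_csv, rows[num_train:])]:
        with open(filename, 'w', newline='') as outfile:
            csv.writer(outfile).writerows(subset)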
In [3]:
experimental_directory = mkdtemp()
with smart_file_open(path.join(insurance_lib_data_dir, 'validation_gt_relevance_file.csv')) as infile:
    split_file_into_train_and_test(initialize_query_stream(infile, file_format='relevance_file'),
                                   experimental_directory, train_percentage=0.80)
print('\nCreated train and validation splits in directory: {}'.format(experimental_directory))
for filename in glob.glob('{}/*.csv'.format(experimental_directory), recursive=True):
    print(filename)
Now we generate predictions for the validation split and evaluate the performance.
In [7]:
rows = 100
ndcg_evaluated_at = 50
test_set = path.join(experimental_directory, 'validation.relevance_file.csv')
prediction_file = insert_modifier_in_filename(test_set, 'discovery_predictions', 'txt')

with smart_file_open(test_set) as infile:
    # generate predictions
    labelled_test_questions = RankerRelevanceFileQueryStream(infile)
    json.dump(discovery.generate_natural_language_prediction_scores(
        test_questions=labelled_test_questions, num_rows=rows,
        prediction_file_location=prediction_file, collection_id=collection_id), sys.stdout, sort_keys=True, indent=4)

    # score them
    labelled_test_questions.reset()
    with smart_file_open(prediction_file) as preds_file:
        prediction_reader = PredictionReader(preds_file)
        stats, _ = compute_performance_stats(prediction_reader=prediction_reader,
                                             ground_truth_query_stream=labelled_test_questions,
                                             k=ndcg_evaluated_at)

print('\nTest Performance')
json.dump(stats, sys.stdout, sort_keys=True, indent=4)
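The headline metric reported above is NDCG@k (here k=50). The repo's compute_performance_stats takes care of the prediction and ground-truth file formats; the underlying metric, in one common formulation with linear gains, is roughly the sketch below (the doc ids and relevance labels are made up):

import math

def dcg_at_k(relevances, k):
    # discounted cumulative gain over the top-k graded relevance labels
    return sum(rel / math.log2(rank + 2) for rank, rel in enumerate(relevances[:k]))

def ndcg_at_k(ranked_doc_ids, relevance_by_doc_id, k):
    gains = [relevance_by_doc_id.get(doc_id, 0) for doc_id in ranked_doc_ids]
    ideal_dcg = dcg_at_k(sorted(relevance_by_doc_id.values(), reverse=True), k)
    return dcg_at_k(gains, k) / ideal_dcg if ideal_dcg > 0 else 0.0

# e.g. a query where the top-ranked doc is somewhat relevant and the best doc is ranked third
print(ndcg_at_k(['d1', 'd9', 'd3'], {'d1': 1, 'd3': 2}, k=50))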